Interdisciplinary Project- Reef 4

Datasets Loading

Code
# install.packages("cvTools", repos = "https://cran.rstudio.com/")
# install.packages("viridis")
# install.packages("sf", dependencies = TRUE)
# install.packages("ncdf4")
# install.packages("ggthemes")
# install.packages("GGally")
# install.packages("FNN", dependencies=TRUE) 
# install.packages("ggspatial")
# install.packages("mapdata")
# install.packages("corrplot")
# install.packages("ggcorrplot") 
# install.packages(c("ordinal", "rpart", "randomForest", "e1071", "caret", "dplyr"))
# install.packages("VGAM")
# install.packages("rpart")
# install.packages("party")
# install.packages("partykit")

Bleaching survey, Reef check, and Ereefs datasets

Code
# Load the two CSV survey datasets and open the eReefs NetCDF file.
bleaching_surveys <- read.csv("bleachingSurveys-1.csv")
reef_check <- read.csv(
  "Reef_Check_with_cortad_variables_with_annual_rate_of_SST_change.csv"
)

eReefs_nc <- ncdf4::nc_open("eReefs_assessment.nc")

# Quick structural overview and summary statistics of each table.
str(bleaching_surveys)
summary(bleaching_surveys)

str(reef_check)
summary(reef_check)
Code
names(eReefs_nc$var)  # list the variable names available in the NetCDF file
Code
# NOTE(review): hard-coded, user-specific absolute path — this breaks on any
# other machine. The same file is opened with a relative path elsewhere in
# this script; prefer that form.
file_path <- "/Users/cuteemma/desktop/data3888/Assessment 1/A1- Reef/eReefs_assessment.nc"
ereefs_nc <- nc_open(file_path)

# Pull coordinate vectors and selected environmental variables.
lat <- ncvar_get(ereefs_nc, "latitude")
lon <- ncvar_get(ereefs_nc, "longitude")
temperature <- ncvar_get(ereefs_nc, "temp")
total_nitrogen <- ncvar_get(ereefs_nc, "TOTAL_NITROGEN")
ph <- ncvar_get(ereefs_nc, "PH")
salt <- ncvar_get(ereefs_nc, "salt")
nc_close(ereefs_nc)

# NOTE(review): lat/lon are 1-D coordinate vectors while temperature etc. are
# multi-dimensional arrays; data.frame() silently RECYCLES the short vectors,
# so Latitude/Longitude here will not line up with the flattened values.
# The expand.grid() construction used later in this script is the correct
# approach. `ereefs_data` also appears unused downstream (the later
# `eReefs_df` is what the analysis uses) — consider removing this chunk.
ereefs_data <- data.frame(
  Latitude = as.vector(lat),
  Longitude = as.vector(lon),
  Temperature = as.vector(temperature),
  Total_Nitrogen = as.vector(total_nitrogen),
  pH = as.vector(ph),
  Salt = as.vector(salt)
)

# Preview the two survey tables.
head(bleaching_surveys)
head(reef_check)
Code
# Re-open the eReefs NetCDF (relative path this time) and extract coordinate
# axes, the decoded time axis, and the environmental variables of interest.
eReefs_nc <- ncdf4::nc_open("eReefs_assessment.nc")

lat = ncdf4::ncvar_get(eReefs_nc, "latitude")
long = ncdf4::ncvar_get(eReefs_nc, "longitude")

# Decode the numeric time axis into calendar dates via its CF "units"
# attribute (e.g. "days since ...").
time = ncdf4::ncvar_get(eReefs_nc, "time")
tunits = ncdf4::ncatt_get(eReefs_nc, "time", "units")
cf = CFtime::CFtime(tunits$value, calendar = "standard", time)  
timestamps = CFtime::as_timestamp(cf)  
timestamps = as.Date(timestamps, format = "%Y-%m-%d")  

# explanatory variables 
temp = ncdf4::ncvar_get(eReefs_nc, "temp")
salt = ncdf4::ncvar_get(eReefs_nc, "salt")
total_nitrogen = ncdf4::ncvar_get(eReefs_nc, "TOTAL_NITROGEN")
ph = ncdf4::ncvar_get(eReefs_nc, "PH")
macroalgae = ncdf4::ncvar_get(eReefs_nc, "MA_N")

# convert extracted data into a data frame
# NOTE(review): this relies on the arrays being dimensioned (long, lat, time)
# in exactly that order so that as.vector() lines up with expand.grid() —
# verify against the NetCDF dimension order.
eReefs_df = expand.grid(long = long, lat = lat, time = timestamps) %>%
    mutate(
        temp = as.vector(temp),
        salt = as.vector(salt),
        total_nitrogen = as.vector(total_nitrogen),
        ph = as.vector(ph),
        macroalgae = as.vector(macroalgae)
    )

# remove rows where all environmental variables are NA
# (note: macroalgae is not part of this condition — a row is kept whenever
# any of temp/salt/total_nitrogen/ph is present)
eReefs_df = eReefs_df %>% filter(!is.na(temp) | !is.na(salt) | !is.na(total_nitrogen) | !is.na(ph))

ncdf4::nc_close(eReefs_nc)
head(eReefs_df)

LTMP & manta datasets

Code
# LTMP site-level coral-cover data; read_csv (readr) parses SAMPLE_DATE as a
# Date automatically (see the column specification printed below).
ltmp_data <- read_csv("ltmp_hc_sc_a_by_site.csv")
Rows: 18540 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (5): SECTOR, SHELF, REEF_NAME, REEF_ID, GROUP_CODE
dbl  (6): SITE_NO, LATITUDE, LONGITUDE, VISIT_NO, YEAR_CODE, COVER
date (1): SAMPLE_DATE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Manta-tow reef summaries; coerce sample dates in both tables to Date so
# they can be compared against the 2015-2017 study window below.
manta_data <- read.csv("manta-tow-by-reef.csv")

manta_data <- manta_data %>% mutate(SAMPLE_DATE = as.Date(SAMPLE_DATE))
ltmp_data <- ltmp_data %>% mutate(SAMPLE_DATE = as.Date(SAMPLE_DATE))

Datasets filtering and cleaning

Code
# Latitude zones used throughout this analysis:
#   Northern GBR: (-16, -9], Central GBR: (-21, -16], Southern GBR: [-24, -21].
# A shared helper avoids repeating the same case_when in every chunk.
classify_gbr_zone <- function(lat) {
  dplyr::case_when(
    lat <= -9 & lat > -16 ~ "Northern GBR",
    lat <= -16 & lat > -21 ~ "Central GBR",
    lat <= -21 & lat >= -24 ~ "Southern GBR",
    TRUE ~ NA_character_
  )
}

# filter & clean Bleaching Surveys: 2015-2017 and GBR bounding box
bleaching_surveys_filtered <- bleaching_surveys %>%
  mutate(sample_date = as.Date(surveyDate),
         year = year(sample_date)) %>%
  filter(
    year >= 2015 & year <= 2017,
    latitude >= -24 & latitude <= -9,
    longitude >= 142 & longitude <= 155
  ) %>%
  mutate(Zone = classify_gbr_zone(latitude))

# filter & clean LTMP dataset
ltmp_data_filtered <- ltmp_data %>%
  filter(
    SAMPLE_DATE >= as.Date("2015-01-01") & SAMPLE_DATE <= as.Date("2017-12-31"),
    LATITUDE >= -24 & LATITUDE <= -9,
    LONGITUDE >= 142 & LONGITUDE <= 155
  ) %>%
  mutate(Zone = classify_gbr_zone(LATITUDE))

# filter & clean MANTA dataset
manta_data_filtered <- manta_data %>%
  filter(
    SAMPLE_DATE >= as.Date("2015-01-01") & SAMPLE_DATE <= as.Date("2017-12-31"),
    LATITUDE >= -24 & LATITUDE <= -9,
    LONGITUDE >= 142 & LONGITUDE <= 155
  ) %>%
  mutate(Zone = classify_gbr_zone(LATITUDE))
Code
# Reef Check: keep 2015-2017 records inside the GBR bounding box and label
# each record with its latitude zone.
reef_check_filtered <- reef_check %>%
  filter(
    dplyr::between(Year, 2015, 2017),
    dplyr::between(Latitude.Degrees, -24, -9),
    dplyr::between(Longitude.Degrees, 142, 155)
  ) %>%
  mutate(
    Zone = case_when(
      Latitude.Degrees > -16 & Latitude.Degrees <= -9 ~ "Northern GBR",
      Latitude.Degrees > -21 & Latitude.Degrees <= -16 ~ "Central GBR",
      Latitude.Degrees >= -24 & Latitude.Degrees <= -21 ~ "Southern GBR",
      TRUE ~ NA_character_
    )
  )
Code
dim(bleaching_surveys_filtered)
[1] 7608   12
Code
dim(ltmp_data_filtered)
[1] 1764   13
Code
dim(manta_data_filtered)
[1] 194  23
Code
dim(reef_check_filtered)
[1] 31 56
Code
# eReefs: restrict to the GBR bounding box and the 2015-2017 study window,
# then attach year and latitude-zone labels.
ereefs_data_filtered <- eReefs_df %>%
  filter(
    dplyr::between(lat, -24, -9),
    dplyr::between(long, 142, 155),
    dplyr::between(time, as.Date("2015-01-01"), as.Date("2017-12-31"))
  ) %>%
  mutate(
    Year = lubridate::year(time),
    Zone = case_when(
      lat > -16 & lat <= -9 ~ "Northern GBR",
      lat > -21 & lat <= -16 ~ "Central GBR",
      lat >= -24 & lat <= -21 ~ "Southern GBR",
      TRUE ~ NA_character_
    )
  )

dim(ereefs_data_filtered)
head(ereefs_data_filtered)
Code
dim(ereefs_data_filtered)
[1] 267834     10
Code
dim(bleaching_surveys_filtered)
[1] 7608   12
Code
dim(ltmp_data_filtered)
[1] 1764   13

EDA

Distribution of Bleaching Severity (Overall)

Code
# Bar chart: number of surveys falling in each bleaching category.
bleaching_surveys_filtered %>%
  ggplot(aes(x = factor(bleachCat))) +
  geom_bar(fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Distribution of Bleaching Severity",
    x = "Bleaching Category",
    y = "Count"
  )

Mean Environmental Variables by Zone

Code
# Mean modelled temperature per zone, shown as a column chart.
zone_mean_temp <- ereefs_data_filtered %>%
  group_by(Zone) %>%
  summarise(mean_temp = mean(temp, na.rm = TRUE))

ggplot(zone_mean_temp, aes(x = Zone, y = mean_temp, fill = Zone)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Mean Sea Surface Temperature by GBR Zone", y = "SST (°C)", x = "") +
  theme_minimal()

Coral Cover by Zone (LTMP)

Code
# Boxplots of LTMP coral-cover observations, split by zone.
ltmp_data_filtered %>%
  ggplot(aes(x = Zone, y = COVER, fill = Zone)) +
  geom_boxplot(show.legend = FALSE) +
  theme_minimal() +
  labs(title = "Distribution of Coral Cover by Zone", y = "Coral Cover (%)")

Exploratory Visualizations

Map of Survey Locations

Code
# australia coastline
oz <- map_data("worldHires", region = "Australia")

# Map of bleaching survey points over the Australian coastline, coloured by
# bleaching category, with the zone boundary latitudes marked in red.
p_map <- ggplot() +
  geom_polygon(data = oz, aes(x = long, y = lat, group = group), fill = "gray90", color = NA) +
  
  # bleaching data
  geom_point(data = bleaching_surveys_filtered,
             aes(x = longitude, y = latitude, color = bleachCat),
             alpha = 0.8, size = 2.5) +
  
  # horizontal red lines separating zones
  # (`linewidth` replaces the `size` aesthetic for lines, deprecated in
  # ggplot2 3.4.0 — this removes the deprecation warning)
  geom_hline(yintercept = c(-16, -21), color = "red", linetype = "dashed", linewidth = 1) +
  
  # color scale: higher = brighter, lower = darker (standard viridis)
  scale_color_viridis_c(name = "Bleaching Category", option = "viridis") +
  
  coord_fixed(xlim = c(142, 155), ylim = c(-25, -9)) +
  
  labs(title = "Coral Bleaching Severity (2015–2017) with GBR Zone Boundaries",
       x = "Longitude", y = "Latitude") +
  
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", hjust = 0.5),
    panel.grid = element_line(color = "gray90")
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Code
p_map

Code
# Save the map. The en dash (–) in the title triggers a harmless font
# substitution warning with the default pdf() device; device = cairo_pdf
# would avoid it.
ggsave("bleaching_map_gbr_zones_2015_2017.pdf", plot = p_map, width = 10, height = 6)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Coral Bleaching Severity (2015–2017) with GBR Zone Boundaries' in
'mbcsToSbcs': - substituted for – (U+2013)

Bleaching by Year and Zone

Code
# Proportional (stacked-to-1) bars of bleaching severity per year, one panel
# per zone. Note the scale_fill_* name (applied after labs) is what actually
# titles the legend.
p_prop <- bleaching_surveys_filtered %>%
  ggplot(aes(x = factor(year), fill = factor(bleachCat))) +
  geom_bar(position = "fill") +
  facet_wrap(~Zone) +
  labs(
    title = "Proportion of Bleaching Severity by Year and Zone (2015–2017)",
    y = "Proportion",
    x = "Year",
    fill = "Bleaching Severity"
  ) +
  scale_fill_viridis_d(option = "viridis", name = "Bleaching Category") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "right"
  )

p_prop

Code
ggsave("proportion_bleaching_by_zone_year_2015_2017.pdf", plot = p_prop, width = 10, height = 6)
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Proportion of Bleaching Severity by Year and Zone (2015–2017)' in
'mbcsToSbcs': - substituted for – (U+2013)

Dataset merging

Code
bleaching_surveys_filtered <- bleaching_surveys_filtered %>%
  mutate(surveyDate = as.Date(surveyDate))

# convert both datasets to sf objects (Spatial Features), WGS84 lon/lat
bleach_sf <- st_as_sf(bleaching_surveys_filtered, coords = c("longitude", "latitude"), crs = 4326)
ereefs_sf <- st_as_sf(ereefs_data_filtered, coords = c("long", "lat"), crs = 4326)

# spatial join within 5000 metres (the original comment said 1000 m, but
# dist = 5000 is what is actually used).
# NOTE(review): this is a one-to-many join — every eReefs grid point within
# 5 km of a survey produces a row, so each survey is duplicated many times.
joined_sf <- st_join(bleach_sf, ereefs_sf, join = st_is_within_distance, dist = 5000, left = TRUE)

# filter for rows where years match (pair each survey only with same-year
# environmental values)
joined_filtered <- joined_sf %>%
  mutate(
    bleach_year = year(surveyDate),
    env_year = Year
  ) %>%
  filter(bleach_year == env_year)

# convert back to data frame for further analysis or modeling
# (the sf geometry column is retained as a list column)
joined_df <- joined_filtered %>%
  as.data.frame()

dim(joined_df)
head(joined_df)
Code
write.csv(joined_df, "joined_df.csv", row.names = FALSE)

Heatmap

Code
# Numeric copy of the ordinal bleaching category so it can enter a
# correlation matrix alongside the environmental variables.
joined_df$bleachCat_num <- as.numeric(as.character(joined_df$bleachCat))

heat_vars <- c("temp", "salt", "ph", "total_nitrogen", "macroalgae",
               "depth", "bleachCat_num")

# Spearman (rank) correlations, dropping incomplete rows.
cor_matrix <- cor(joined_df[, heat_vars], use = "complete.obs",
                  method = "spearman")

p_corr <- ggcorrplot(
  cor_matrix,
  method = "square",
  type = "lower",
  lab = TRUE,
  lab_size = 3.5,
  colors = c("darkred", "white", "darkblue"),
  title = "Correlation Heatmap: Environmental Variables vs Bleaching",
  ggtheme = theme_minimal()
)

p_corr

Code
ggsave("correlation_heatmap_bleaching.pdf", plot = p_corr, width = 7, height = 6)

Plot bleachCat vs temp

Code
# Recover point coordinates from the sf geometry column.
# The previous code parsed the printed text form of the geometry with
# regexes, which is fragile; sf::st_coordinates() extracts the numeric X/Y
# directly, in the same row order.
coords <- sf::st_coordinates(joined_df$geometry)
joined_df <- joined_df %>%
  mutate(
    longitude = coords[, "X"],
    latitude  = coords[, "Y"]
  )

# create GBR region labels (same banding as the Zone column computed earlier;
# kept because the plotting code below facets on `region`)
joined_df <- joined_df %>%
  mutate(region = case_when(
    latitude <= -9 & latitude > -16 ~ "Northern GBR",
    latitude <= -16 & latitude > -21 ~ "Central GBR",
    latitude <= -21 & latitude >= -24 ~ "Southern GBR",
    TRUE ~ NA_character_
  )) %>%
  filter(!is.na(region))

# Jittered scatter of bleaching category vs temperature, one panel per
# region, with a linear trend line.
p1 <- ggplot(joined_df,
             aes(x = temp,
                 y = as.numeric(as.character(bleachCat)),
                 color = bleachCat)) +
  geom_jitter(alpha = 0.4, size = 1.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ region, ncol = 3) +
  labs(
    title = "Relationship between Temperature and Coral Bleaching Category",
    subtitle = "By Region of the Great Barrier Reef",
    x = "Sea Surface Temperature (°C)",
    y = "Bleaching Category (0–4)"
  ) +
  theme_minimal()

# Same layout for total nitrogen.
p2 <- ggplot(joined_df,
             aes(x = total_nitrogen,
                 y = as.numeric(as.character(bleachCat)),
                 color = bleachCat)) +
  geom_jitter(alpha = 0.4, size = 1.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ region, ncol = 3) +
  labs(
    title = "Relationship between Total Nitrogen and Coral Bleaching Category",
    subtitle = "By Region of the Great Barrier Reef",
    x = "Total Nitrogen (µmol/L)",
    y = "Bleaching Category (0–4)"
  ) +
  theme_minimal()

print(p1)
`geom_smooth()` using formula = 'y ~ x'

Code
print(p2)
`geom_smooth()` using formula = 'y ~ x'

Code
ggsave("temp_vs_bleaching_by_region.pdf", plot = p1, width = 10, height = 6)
`geom_smooth()` using formula = 'y ~ x'
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Bleaching Category (0–4)' in 'mbcsToSbcs': - substituted for – (U+2013)
Code
ggsave("nitrogen_vs_bleaching_by_region.pdf", plot = p2, width = 10, height = 6)
`geom_smooth()` using formula = 'y ~ x'
Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
for 'Bleaching Category (0–4)' in 'mbcsToSbcs': - substituted for – (U+2013)

Predictive Modeling

Code
# select features and target
features <- c("temp", "salt", "total_nitrogen", "ph", "macroalgae", "depth")
target <- "bleachCat"

# prepare modeling dataframe: keep complete rows only.
# NOTE: the original code also mean-imputed the features AFTER
# complete.cases(), which could never fire (no NAs remain at that point);
# that dead imputation step has been removed. If imputation is wanted, it
# must happen BEFORE the complete-case filter.
modeling_df <- joined_df[, c(features, target)]
modeling_df <- modeling_df[complete.cases(modeling_df), ]

# make sure target is a factor (for classification)
modeling_df$bleachCat <- as.factor(modeling_df$bleachCat)

# 80/20 stratified train-test split (seeded for reproducibility)
set.seed(3888)
train_index <- createDataPartition(modeling_df$bleachCat, p = 0.8, list = FALSE)
train_data <- modeling_df[train_index, ]
test_data  <- modeling_df[-train_index, ]

cat("Train size:", nrow(train_data), "\n")
Train size: 2148 
Code
cat("Test size :", nrow(test_data), "\n")
Test size : 535 
Code
# define repeated cross-validation strategy (for model training)
# NOTE(review): cv_control is defined but never passed to caret::train()
# anywhere below — the models are fitted directly with vglm/svm/ctree, so
# this control object is currently unused. Also, classProbs = TRUE requires
# syntactically valid factor level names; the bleachCat levels "0".."4"
# would need relabelling (e.g. via make.names) before caret could use this.
cv_control <- trainControl(
  method = "repeatedcv",
  number = 5,       # 5-fold cross-validation
  repeats = 3,      # repeated 3 times
  classProbs = TRUE,
  summaryFunction = multiClassSummary # optional; for classification models
)
Code
sum(is.na(modeling_df))
[1] 0
Code
colSums(is.na(modeling_df))
          temp           salt total_nitrogen             ph     macroalgae 
             0              0              0              0              0 
         depth      bleachCat 
             0              0 

Penalized Ordinal Regression

Code
# Proportional-odds cumulative-logit model. (Despite the label used below,
# no penalisation is applied here — this is plain ordinal regression.)
# Fitting on an ordered factor addresses VGAM's "response should be
# ordinal---see ordered()" warning without changing the cumulative-link fit.
vglm_model <- vglm(bleachCat ~ .,
                   family = cumulative(parallel = TRUE),
                   data = transform(train_data, bleachCat = ordered(bleachCat)))

vglm_preds <- predict(vglm_model, newdata = test_data, type = "response")
# Map each row's most probable column back to its actual category label via
# the column names, instead of assuming labels are consecutive integers
# starting at 0 (equivalent here, but robust to missing/relabelled classes).
vglm_class <- as.numeric(colnames(vglm_preds))[apply(vglm_preds, 1, which.max)]
vglm_acc <- mean(vglm_class == as.numeric(as.character(test_data$bleachCat)))
cat("Penalized Ordinal Regression Accuracy:", round(vglm_acc, 3), "\n")
Penalized Ordinal Regression Accuracy: 0.329 

SVM with Ordinal Labels

Code
# ensure bleachCat is numeric for ordinal SVM: the ordinal response is
# treated as a number, fitted by support-vector regression, and predictions
# are rounded back to the nearest category.
train_data$bleachCat_ord <- as.numeric(as.character(train_data$bleachCat))
test_data$bleachCat_ord <- as.numeric(as.character(test_data$bleachCat))

svm_model <- svm(bleachCat_ord ~ ., data = train_data[, c(features, "bleachCat_ord")])

# Round to the nearest category, then clamp to the label range seen in
# training: plain round() can step outside the valid category range.
label_range <- range(train_data$bleachCat_ord)
svm_preds <- pmin(pmax(round(predict(svm_model, test_data[, features])),
                       label_range[1]),
                  label_range[2])
svm_acc <- mean(svm_preds == test_data$bleachCat_ord)
cat("Ordinal SVM Accuracy:", round(svm_acc, 3), "\n")
Ordinal SVM Accuracy: 0.379 

Ordinal Decision Tree

Code
# ensure bleachCat is an ordered factor so ctree treats it as an ordinal
# response rather than a plain nominal one
train_data$bleachCat <- factor(train_data$bleachCat, ordered = TRUE)
test_data$bleachCat  <- factor(test_data$bleachCat, ordered = TRUE)

# fit an ordinal-aware decision tree (conditional inference tree) using ctree 
tree_model <- ctree(bleachCat ~ temp + salt + total_nitrogen + ph + macroalgae + depth,
                    data = train_data)

# predict on the test set (returns factor-class predictions)
tree_preds <- predict(tree_model, newdata = test_data)

# convert both sides to character so the comparison ignores factor levels,
# then compute accuracy
tree_acc <- mean(as.character(tree_preds) == as.character(test_data$bleachCat))
cat("Ordinal Decision Tree (ctree) Accuracy:", round(tree_acc, 3), "\n")
Ordinal Decision Tree (ctree) Accuracy: 0.613 

Assess the performance on the models:

Confusion Matrices

Code
cat("\nConfusion Matrix: Penalized Ordinal Regression (VGAM)\n")

Confusion Matrix: Penalized Ordinal Regression (VGAM)
Code
# Align the factor levels of predictions and reference explicitly; without
# this, confusionMatrix() warns and refactors because some classes are never
# predicted (the resulting table is the same).
bleach_levels <- sort(unique(as.numeric(as.character(test_data$bleachCat))))
confusionMatrix(factor(vglm_class, levels = bleach_levels),
                factor(as.numeric(as.character(test_data$bleachCat)), levels = bleach_levels))
Warning in levels(reference) != levels(data): longer object length is not a
multiple of shorter object length
Warning in confusionMatrix.default(factor(vglm_class),
factor(as.numeric(as.character(test_data$bleachCat)))): Levels are not in the
same order for reference and data. Refactoring data to match.
Confusion Matrix and Statistics

          Reference
Prediction   0   1   2   3   4
         0 125  57  41  82  22
         1   0   0   0   0   0
         2   0   0   0   0   0
         3  64  18  41  51  34
         4   0   0   0   0   0

Overall Statistics
                                          
               Accuracy : 0.329           
                 95% CI : (0.2893, 0.3706)
    No Information Rate : 0.3533          
    P-Value [Acc > NIR] : 0.8894          
                                          
                  Kappa : 0.0239          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
Sensitivity            0.6614   0.0000   0.0000  0.38346   0.0000
Specificity            0.4162   1.0000   1.0000  0.60945   1.0000
Pos Pred Value         0.3823      NaN      NaN  0.24519      NaN
Neg Pred Value         0.6923   0.8598   0.8467  0.74924   0.8953
Prevalence             0.3533   0.1402   0.1533  0.24860   0.1047
Detection Rate         0.2336   0.0000   0.0000  0.09533   0.0000
Detection Prevalence   0.6112   0.0000   0.0000  0.38879   0.0000
Balanced Accuracy      0.5388   0.5000   0.5000  0.49646   0.5000
Code
cat("\nConfusion Matrix: SVM\n")

Confusion Matrix: SVM
Code
confusionMatrix(factor(svm_preds), factor(test_data$bleachCat_ord))
Confusion Matrix and Statistics

          Reference
Prediction  0  1  2  3  4
         0 51  3  0  0  1
         1 54 39 29  7  0
         2 59 29 43 57 19
         3 25  4 10 69 35
         4  0  0  0  0  1

Overall Statistics
                                          
               Accuracy : 0.3794          
                 95% CI : (0.3382, 0.4221)
    No Information Rate : 0.3533          
    P-Value [Acc > NIR] : 0.1114          
                                          
                  Kappa : 0.2281          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
Sensitivity           0.26984   0.5200  0.52439   0.5188 0.017857
Specificity           0.98844   0.8043  0.63797   0.8159 1.000000
Pos Pred Value        0.92727   0.3023  0.20773   0.4825 1.000000
Neg Pred Value        0.71250   0.9113  0.88110   0.8367 0.897004
Prevalence            0.35327   0.1402  0.15327   0.2486 0.104673
Detection Rate        0.09533   0.0729  0.08037   0.1290 0.001869
Detection Prevalence  0.10280   0.2411  0.38692   0.2673 0.001869
Balanced Accuracy     0.62914   0.6622  0.58118   0.6674 0.508929
Code
cat("\nConfusion Matrix: Decision Tree\n")

Confusion Matrix: Decision Tree
Code
confusionMatrix(factor(tree_preds), factor(test_data$bleachCat))
Confusion Matrix and Statistics

          Reference
Prediction   0   1   2   3   4
         0 153  18   4   4   4
         1  16  39  25   3   1
         2   0   5  23  11   0
         3  20  12  30 101  39
         4   0   1   0  14  12

Overall Statistics
                                          
               Accuracy : 0.6131          
                 95% CI : (0.5703, 0.6546)
    No Information Rate : 0.3533          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.4819          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
Sensitivity            0.8095   0.5200  0.28049   0.7594  0.21429
Specificity            0.9133   0.9022  0.96468   0.7488  0.96868
Pos Pred Value         0.8361   0.4643  0.58974   0.5000  0.44444
Neg Pred Value         0.8977   0.9202  0.88105   0.9039  0.91339
Prevalence             0.3533   0.1402  0.15327   0.2486  0.10467
Detection Rate         0.2860   0.0729  0.04299   0.1888  0.02243
Detection Prevalence   0.3421   0.1570  0.07290   0.3776  0.05047
Balanced Accuracy      0.8614   0.7111  0.62258   0.7541  0.59149

Accuracy Table

Code
# create accuracy dataframe using stored accuracy values
# (one row per model; Accuracy is the held-out test-set accuracy computed
# in the modelling chunks above)
accuracy_df <- data.frame(
  Model = c("Penalized Ordinal Regression (VGAM)", 
            "SVM (Ordinal Labels)", 
            "Ordinal Decision Tree"),
  Accuracy = c(vglm_acc, svm_acc, tree_acc)
)

print(accuracy_df)
                                Model  Accuracy
1 Penalized Ordinal Regression (VGAM) 0.3289720
2                SVM (Ordinal Labels) 0.3794393
3               Ordinal Decision Tree 0.6130841

Bar Chart

Code
# Horizontal bar chart comparing test-set accuracy across the three models,
# ordered worst-to-best.
p_accuracy <- accuracy_df %>%
  ggplot(aes(x = reorder(Model, Accuracy), y = Accuracy)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_text(aes(label = round(Accuracy, 3)), vjust = -0.5, size = 4) +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  ylim(0, 1) +
  theme_minimal() +
  coord_flip()

p_accuracy

Code
ggsave("model_accuracy_comparison.pdf", plot = p_accuracy, width = 8, height = 5)

Prediction on Future Data (2018–2020)

Code
# Restrict eReefs to the 2018-2020 prediction window.
eReefs_df$time <- as.Date(eReefs_df$time)

ereefs_future <- eReefs_df %>%
  filter(dplyr::between(year(time), 2018, 2020))

cat("Filtered future data:", nrow(ereefs_future), "rows\n")
Filtered future data: 226168 rows
Code
# ensure required features exist 
features <- c("temp", "salt", "total_nitrogen", "ph", "macroalgae", "depth")

# The eReefs table has no depth column, so every future row receives the
# same training-median depth. NOTE(review): this zero-variance column makes
# any later correlation against depth undefined (NA) — see the Spearman
# warnings further down.
if (!"depth" %in% colnames(ereefs_future)) {
  ereefs_future$depth <- median(train_data$depth, na.rm = TRUE)
}

# remove rows with NA in predictors 
ereefs_future <- ereefs_future[complete.cases(ereefs_future[, features]), ]

# predict using trained decision tree model 
future_preds <- predict(tree_model, newdata = ereefs_future)
ereefs_future$predicted_bleachCat <- future_preds

# add year and region 
ereefs_future$year <- year(ereefs_future$time)

ereefs_future$region <- case_when(
  ereefs_future$lat <= -9 & ereefs_future$lat > -16 ~ "Northern GBR",
  ereefs_future$lat <= -16 & ereefs_future$lat > -21 ~ "Central GBR",
  ereefs_future$lat <= -21 & ereefs_future$lat >= -24 ~ "Southern GBR",
  TRUE ~ "Outside Bounds"
)

# drop grid points outside the three GBR latitude bands
ereefs_future <- ereefs_future %>% filter(region != "Outside Bounds")

# preview and export 
head(ereefs_future[, c("time", "year", "region", "long", "lat", "predicted_bleachCat")], 10)

# preview and export 
head(ereefs_future[, c("time", "year", "region", "long", "lat", "predicted_bleachCat")], 10)
         time year       region     long       lat predicted_bleachCat
1  2018-01-01 2018 Southern GBR 152.6688 -23.98602                   0
2  2018-01-01 2018 Southern GBR 152.6988 -23.98602                   0
3  2018-01-01 2018 Southern GBR 152.7288 -23.98602                   0
4  2018-01-01 2018 Southern GBR 152.7588 -23.98602                   0
5  2018-01-01 2018 Southern GBR 152.7888 -23.98602                   0
6  2018-01-01 2018 Southern GBR 152.8188 -23.98602                   0
7  2018-01-01 2018 Southern GBR 152.8488 -23.98602                   0
8  2018-01-01 2018 Southern GBR 152.8788 -23.98602                   0
9  2018-01-01 2018 Southern GBR 152.9088 -23.98602                   0
10 2018-01-01 2018 Southern GBR 152.9388 -23.98602                   0
Code
write.csv(ereefs_future, "ereefs_predictions_2018_2020.csv", row.names = FALSE)

Approximate “stress over time” using rolling or max values

Code
# Per-gridcell annual summary: worst predicted bleaching category plus mean
# environmental exposure over the year, as an approximation of "stress over
# time".
ereefs_future_summary <- ereefs_future %>%
  group_by(lat, long, year, region) %>%
  summarise(
    max_bleach = max(predicted_bleachCat, na.rm = TRUE),
    mean_temp = mean(temp, na.rm = TRUE),
    mean_nitrogen = mean(total_nitrogen, na.rm = TRUE),
    mean_ph = mean(ph, na.rm = TRUE),
    .groups = "drop"
  )

Visualizations

Predicted Bleaching Severity (2018–2020)

Code
# prepare data for plotting
# (year and region were already added in the prediction chunk above; this
# recomputation is redundant but harmless)
ereefs_future$year <- year(ereefs_future$time)
ereefs_future$region <- case_when(
  ereefs_future$lat <= -9 & ereefs_future$lat > -16 ~ "Northern GBR",
  ereefs_future$lat <= -16 & ereefs_future$lat > -21 ~ "Central GBR",
  ereefs_future$lat <= -21 & ereefs_future$lat >= -24 ~ "Southern GBR",
  TRUE ~ "Outside Bounds"
)

# restrict to GBR lat/lon range
plot_df <- ereefs_future %>%
  filter(lat <= -9 & lat >= -24)  # optional cleaning

# Faceted map of predicted bleaching per year; dashed red lines mark the
# Northern/Central/Southern zone boundary latitudes.
p_bleach_map <- ggplot(plot_df, aes(x = long, y = lat, color = as.numeric(predicted_bleachCat))) +
  geom_point(alpha = 0.8, size = 1.5) +
  scale_color_viridis_c(name = "Bleaching Category", option = "viridis") +
  facet_wrap(~ year, ncol = 3) +
  theme_minimal() +
  labs(
    title = "Predicted Coral Bleaching Severity (2018-2020) Across Latitude Bands",
    x = "Longitude",
    y = "Latitude"
  ) +
  # `linewidth` replaces the deprecated `size` aesthetic for line geoms
  # (ggplot2 >= 3.4.0)
  geom_hline(yintercept = c(-16, -21), linetype = "dashed", color = "red", linewidth = 0.6) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14),
    legend.position = "right"
  )

p_bleach_map

Code
ggsave("predicted_bleaching_latitude_map_2018_2020.pdf", plot = p_bleach_map, width = 10, height = 6, dpi = 300)
Code
# STEP 3: Plot temperature vs bleachCat by region
# NOTE(review): this chunk (and STEP 4 below) is an exact duplicate of the
# "Plot bleachCat vs temp" section earlier in the script — consider removing
# one of the two copies.
p1 <- ggplot(joined_df, aes(x = temp, y = as.numeric(as.character(bleachCat)), color = bleachCat)) +
  geom_jitter(alpha = 0.4, size = 1.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ region, ncol = 3) +
  labs(
    title = "Relationship between Temperature and Coral Bleaching Category",
    subtitle = "By Region of the Great Barrier Reef",
    x = "Sea Surface Temperature (°C)",
    y = "Bleaching Category (0–4)"
  ) +
  theme_minimal()

# STEP 4: Plot total_nitrogen vs bleachCat by region
p2 <- ggplot(joined_df, aes(x = total_nitrogen, y = as.numeric(as.character(bleachCat)), color = bleachCat)) +
  geom_jitter(alpha = 0.4, size = 1.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ region, ncol = 3) +
  labs(
    title = "Relationship between Total Nitrogen and Coral Bleaching Category",
    subtitle = "By Region of the Great Barrier Reef",
    x = "Total Nitrogen (µmol/L)",
    y = "Bleaching Category (0–4)"
  ) +
  theme_minimal()

print(p1)
`geom_smooth()` using formula = 'y ~ x'

Code
print(p2)
`geom_smooth()` using formula = 'y ~ x'

Model/plot bleaching vs aggregated environmental exposure

Code
# Aggregated exposure vs worst-case predicted bleaching, per region, with a
# loess trend line.
p_temp_bleach <- ggplot(ereefs_future_summary, aes(x = mean_temp, y = max_bleach, color = region)) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(~ region) +
  labs(
    title = "Mean Temperature vs Max Predicted Bleaching (2018-2020)",
    x = "Mean Temperature (°C)",
    y = "Max Predicted Bleaching"
  ) +
  theme_minimal()

p_temp_bleach
`geom_smooth()` using formula = 'y ~ x'

Code
ggsave("mean_temp_vs_max_bleaching_by_region.pdf", plot = p_temp_bleach, width = 10, height = 6, dpi = 300)
`geom_smooth()` using formula = 'y ~ x'

Boxplots for Each Feature by Region

Code
# ensure region classification
# (third recomputation of the region banding in this script — a shared
# helper function would remove the repetition)
ereefs_future$region <- case_when(
  ereefs_future$lat <= -9 & ereefs_future$lat > -16 ~ "Northern GBR",
  ereefs_future$lat <= -16 & ereefs_future$lat > -21 ~ "Central GBR",
  ereefs_future$lat <= -21 & ereefs_future$lat >= -24 ~ "Southern GBR",
  TRUE ~ NA_character_
)

# convert to factor for consistent (north-to-south) ordering in the plots
ereefs_future$region <- factor(ereefs_future$region, levels = c("Northern GBR", "Central GBR", "Southern GBR"))

# list of environmental features to plot
env_features <- c("temp", "salt", "total_nitrogen", "ph", "macroalgae", "depth")

# loop through features and plot
# NOTE(review): library() calls normally belong at the top of the script.
library(ggplot2)

# One boxplot per feature; .data[[feature]] is the tidy-eval way to map a
# column whose name is held in a string.
for (feature in env_features) {
  p <- ggplot(ereefs_future, aes(x = region, y = .data[[feature]], fill = region)) +
    geom_boxplot(alpha = 0.8) +
    labs(
      title = paste("Distribution of", feature, "by GBR Region"),
      x = "Region",
      y = feature
    ) +
    theme_minimal() +
    theme(legend.position = "none")
  print(p)
}

Correlation Between Environmental Factors and Predicted Bleaching by Region

Code
# Spearman correlation of each environmental feature with the (numeric)
# predicted bleaching category, computed separately per region.
ereefs_future$predicted_bleachCat <- as.numeric(as.character(ereefs_future$predicted_bleachCat))

env_features <- c("temp", "salt", "total_nitrogen", "ph", "macroalgae", "depth")

# NOTE: depth was imputed as a single constant value, so its standard
# deviation is zero and cor() returns NA with a warning in every region —
# that is the source of the three warnings reported below.
region_cor_df <- ereefs_future %>%
  filter(!is.na(region)) %>%
  group_by(region) %>%
  summarise(across(all_of(env_features),
                   ~ cor(.x, predicted_bleachCat, method = "spearman", use = "complete.obs")))
Warning: There were 3 warnings in `summarise()`.
The first warning was:
ℹ In argument: `across(...)`.
ℹ In group 1: `region = Northern GBR`.
Caused by warning in `cor()`:
! the standard deviation is zero
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Code
print(region_cor_df)
# A tibble: 3 × 7
  region         temp     salt total_nitrogen      ph macroalgae depth
  <fct>         <dbl>    <dbl>          <dbl>   <dbl>      <dbl> <dbl>
1 Northern GBR 0.732   0.350          -0.656  -0.725     0.203      NA
2 Central GBR  0.565   0.00569        -0.430  -0.464     0.226      NA
3 Southern GBR 0.0722 -0.0120         -0.0143 -0.0307   -0.00350    NA

Visualize Correlation as Heatmap

Code
# Reshape to long format (one row per region x variable) for a tile heatmap
# of the regional correlations.
region_cor_long <- region_cor_df %>%
  pivot_longer(-region, names_to = "Variable", values_to = "Correlation")

p_region_cor <- region_cor_long %>%
  ggplot(aes(x = Variable, y = region, fill = Correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "darkred", midpoint = 0) +
  labs(title = "Spearman Correlation: Environmental Factors vs Predicted Bleaching",
       x = "Environmental Variable", y = "Region") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_region_cor

Code
ggsave("correlation_heatmap_by_region.pdf", plot = p_region_cor, width = 8, height = 5.5, dpi = 300)